AAQoL machine learning analysis with unbalanced random forest

Author

Miguel Fudolig

library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(glmnet)
library(randomForestSRC)
library(caret)

Data set

This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.

Input data set

qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |> 
  mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
         `English Speaking`=relevel(`English Speaking`,ref="Not at all"),
         Ethnicity = relevel(Ethnicity,ref="Chinese")) |> 
  mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
                                         "$10,000 - $19,999" ~"Below",
                                         "$20,000 - $29,999"~"Below",
                                         "$30,000 - $39,999"~"Below",
                                         "$40,000 - $49,999"~"Below",
                                         "$50,000 - $59,999"~"Below",
                                         "$60,000 - $69,999"~"Above",
                                         "$70,000 and over"~"Above",
                                          .default=Income)) |> 
  mutate(Income_median = factor(Income_median, levels=c("Below","Above")))
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

Source of Information: Family

ps(Family)
# A tibble: 4 × 3
  Family     n     pct
  <fct>  <int>   <dbl>
1 3          1  0.0383
2 No      1258 48.2   
3 Yes     1331 51.0   
4 <NA>      19  0.728 
rfdata <- qol |> filter(Family %in% c("No","Yes")) |> 
  mutate(Family=droplevels(Family)) |> 
  select(Family, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  # filter(!is.na(Family)) |> 
  na.omit() |>
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Family ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 2043
           Frequency of class labels: 991, 1052
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 576.0193
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1291
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0616
                   (OOB) Brier score: 0.24879713
        (OOB) Normalized Brier score: 0.99518853
                           (OOB) AUC: 0.6268316
                        (OOB) PR-AUC: 0.5798947
                        (OOB) G-mean: 0.58322112
   (OOB) Requested performance error: 0.41677888

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  593 398      0.4016
       Yes 454 598      0.4316

      (OOB) Misclassification rate: 0.4170338
print(rfobj)
                         Sample size: 2043
           Frequency of class labels: 991, 1052
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 576.0193
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1291
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0616
                   (OOB) Brier score: 0.24879713
        (OOB) Normalized Brier score: 0.99518853
                           (OOB) AUC: 0.6268316
                        (OOB) PR-AUC: 0.5798947
                        (OOB) G-mean: 0.58322112
   (OOB) Requested performance error: 0.41677888

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  593 398      0.4016
       Yes 454 598      0.4316

      (OOB) Misclassification rate: 0.4170338
plot(rfobj,plots.one.page = FALSE)


                       all   No   Yes
Age                 0.0309   NA    NA
Health.Insurance    0.0069   NA    NA
Ethnicity           0.0061   NA    NA
EnglishSpeak        0.0049   NA    NA
Discrimination      0.0006   NA    NA
Dental.Insurance   -0.0022   NA    NA
Employment         -0.0029   NA    NA
Income_median      -0.0059   NA    NA
Gender             -0.0063   NA    NA
EnglishDiff        -0.0071   NA    NA
Religion           -0.0191   NA    NA
rfobj$importance
                           all No Yes
Ethnicity         0.0060708045 NA  NA
Age               0.0308768777 NA  NA
Gender           -0.0063052764 NA  NA
Religion         -0.0190960623 NA  NA
Employment       -0.0029185559 NA  NA
Income_median    -0.0058812711 NA  NA
EnglishSpeak      0.0048920137 NA  NA
EnglishDiff      -0.0071158157 NA  NA
Health.Insurance  0.0068973926 NA  NA
Dental.Insurance -0.0021882329 NA  NA
Discrimination    0.0006458885 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

pos<- rfdata |> filter(Family=="Yes")
neg <- rfdata |> filter(Family=="No")

set.seed(222)
imbal_index <- caret::createDataPartition(rfdata$Family,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Family~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=train, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Family ~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 1635
           Frequency of class labels: 803, 832
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 360.381
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1033
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0361
                   (OOB) Brier score: 0.18045444
        (OOB) Normalized Brier score: 0.72181774
                           (OOB) AUC: 0.80763094
                        (OOB) PR-AUC: 0.79597193
                        (OOB) G-mean: 0.73795692
   (OOB) Requested performance error: 0.26204308

Confusion matrix:

          predicted
  observed Yes  No class.error
       Yes 584 219      0.2727
       No  209 623      0.2512

      (OOB) Misclassification rate: 0.2617737
plot(rfobj,plots.one.page = FALSE)


                      all   Yes   No
EnglishDiff        0.0719    NA   NA
Ethnicity          0.0515    NA   NA
Employment         0.0498    NA   NA
Religion           0.0412    NA   NA
EnglishSpeak       0.0353    NA   NA
Dental.Insurance   0.0296    NA   NA
Age                0.0249    NA   NA
Income_median      0.0184    NA   NA
Gender             0.0167    NA   NA
Health.Insurance   0.0128    NA   NA
Discrimination     0.0076    NA   NA
rfobj$importance
                         all Yes No
Ethnicity        0.051456689  NA NA
Age              0.024930992  NA NA
Gender           0.016728281  NA NA
Religion         0.041167460  NA NA
Employment       0.049775540  NA NA
Income_median    0.018418546  NA NA
EnglishSpeak     0.035258321  NA NA
EnglishDiff      0.071887244  NA NA
Health.Insurance 0.012794833  NA NA
Dental.Insurance 0.029610793  NA NA
Discrimination   0.007621102  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance = T)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
210.0000000 198.0000000   1.0606061   0.4852941   0.6010101   0.5047619 
       prec         npv    misclass       brier  brier.norm         auc 
  0.5336323   0.5729730   0.4485294   0.2510778   1.0043113   0.6057961 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.5653207   0.5506433   0.4852941   0.5777928   0.5580541   0.5507155 
      gmean 
  0.5507876 
test_rf$importance
                          all Yes No
Ethnicity         0.006328172  NA NA
Age               0.019448825  NA NA
Gender            0.006508259  NA NA
Religion         -0.004444357  NA NA
Employment        0.004840415  NA NA
Income_median     0.002827600  NA NA
EnglishSpeak      0.010967159  NA NA
EnglishDiff      -0.003174699  NA NA
Health.Insurance  0.004588388  NA NA
Dental.Insurance  0.004718407  NA NA
Discrimination    0.008131573  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

Source of Information: Health Professionals

ps(`Heal Professionals`)
# A tibble: 3 × 3
  `Heal Professionals`     n    pct
  <fct>                <int>  <dbl>
1 No                    1326 50.8  
2 Yes                   1264 48.4  
3 <NA>                    19  0.728
rfdata <- qol |> 
  select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imbalanced(Heal.Professionals ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="gini")->rfobj

print(rfobj)
                         Sample size: 2044
           Frequency of class labels: 990, 1054
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 570.5297
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1292
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0646
                   (OOB) Brier score: 0.25053189
        (OOB) Normalized Brier score: 1.00212757
                           (OOB) AUC: 0.62961877
                        (OOB) PR-AUC: 0.5837091
                        (OOB) G-mean: 0.59764134
   (OOB) Requested performance error: 0.40235866

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  607 383      0.3869
       Yes 440 614      0.4175

      (OOB) Misclassification rate: 0.4026419
plot(rfobj,plots.one.page = FALSE)


                       all   No   Yes
EnglishSpeak        0.0180   NA    NA
Discrimination      0.0124   NA    NA
Dental.Insurance    0.0113   NA    NA
Health.Insurance    0.0039   NA    NA
EnglishDiff         0.0035   NA    NA
Income_median       0.0015   NA    NA
Gender             -0.0052   NA    NA
Ethnicity          -0.0069   NA    NA
Religion           -0.0099   NA    NA
Employment         -0.0112   NA    NA
Age                -0.0130   NA    NA
rfobj$importance
                          all No Yes
Ethnicity        -0.006852786 NA  NA
Age              -0.013024851 NA  NA
Gender           -0.005190613 NA  NA
Religion         -0.009945867 NA  NA
Employment       -0.011239457 NA  NA
Income_median     0.001458609 NA  NA
EnglishSpeak      0.017958624 NA  NA
EnglishDiff       0.003497160 NA  NA
Health.Insurance  0.003911037 NA  NA
Dental.Insurance  0.011261219 NA  NA
Discrimination    0.012365445 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

pos<- rfdata |> filter(Heal.Professionals=="Yes")
neg <- rfdata |> filter(Heal.Professionals==0)

set.seed(222)
imbal_index <- createDataPartition(rfdata$Heal.Professionals,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Heal.Professionals~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Heal.Professionals ~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 1636
           Frequency of class labels: 804, 832
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 341.4653
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1034
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0348
                   (OOB) Brier score: 0.157625
        (OOB) Normalized Brier score: 0.6305
                           (OOB) AUC: 0.85729107
                        (OOB) PR-AUC: 0.84956315
                        (OOB) G-mean: 0.78549864
   (OOB) Requested performance error: 0.21450136

Confusion matrix:

          predicted
  observed Yes  No class.error
       Yes 634 170      0.2114
       No  181 651      0.2175

      (OOB) Misclassification rate: 0.2145477
print(rfobj)
                         Sample size: 1636
           Frequency of class labels: 804, 832
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 341.4653
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1034
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0348
                   (OOB) Brier score: 0.157625
        (OOB) Normalized Brier score: 0.6305
                           (OOB) AUC: 0.85729107
                        (OOB) PR-AUC: 0.84956315
                        (OOB) G-mean: 0.78549864
   (OOB) Requested performance error: 0.21450136

Confusion matrix:

          predicted
  observed Yes  No class.error
       Yes 634 170      0.2114
       No  181 651      0.2175

      (OOB) Misclassification rate: 0.2145477
plot(rfobj,plots.one.page = FALSE)


                      all   Yes   No
Religion           0.0674    NA   NA
Ethnicity          0.0654    NA   NA
EnglishSpeak       0.0537    NA   NA
Age                0.0464    NA   NA
EnglishDiff        0.0312    NA   NA
Income_median      0.0296    NA   NA
Discrimination     0.0281    NA   NA
Dental.Insurance   0.0269    NA   NA
Gender             0.0220    NA   NA
Health.Insurance   0.0201    NA   NA
Employment         0.0162    NA   NA
rfobj$importance
                        all Yes No
Ethnicity        0.06541069  NA NA
Age              0.04640521  NA NA
Gender           0.02195136  NA NA
Religion         0.06738251  NA NA
Employment       0.01623423  NA NA
Income_median    0.02957817  NA NA
EnglishSpeak     0.05370562  NA NA
EnglishDiff      0.03115796  NA NA
Health.Insurance 0.02010926  NA NA
Dental.Insurance 0.02685986  NA NA
Discrimination   0.02805890  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
210.0000000 198.0000000   1.0606061   0.4852941   0.5656566   0.5285714 
       prec         npv    misclass       brier  brier.norm         auc 
  0.5308057   0.5634518   0.4534314   0.2698256   1.0793025   0.5750722 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.5476773   0.5465636   0.4852941   0.5507492   0.5472385   0.5466817 
      gmean 
  0.5467997 
test_rf$importance
                           all Yes No
Ethnicity        -0.0221623422  NA NA
Age               0.0004597253  NA NA
Gender           -0.0031441717  NA NA
Religion         -0.0018772342  NA NA
Employment       -0.0100570837  NA NA
Income_median    -0.0023044699  NA NA
EnglishSpeak      0.0036238669  NA NA
EnglishDiff      -0.0002706752  NA NA
Health.Insurance  0.0024200857  NA NA
Dental.Insurance  0.0060652123  NA NA
Discrimination   -0.0004590801  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

Health Insurance

ps(`Health Insurance`)
# A tibble: 3 × 3
  `Health Insurance`     n    pct
  <fct>              <int>  <dbl>
1 0                    381 14.6  
2 Yes                 2207 84.6  
3 <NA>                  21  0.805

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Health Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Health.Insurance ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="gini")
print(imb)
                         Sample size: 1936
           Frequency of class labels: 259, 1677
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 232.2483
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1224
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 6.4749
                   (OOB) Brier score: 0.10507578
        (OOB) Normalized Brier score: 0.42030312
                           (OOB) AUC: 0.73686349
                        (OOB) PR-AUC: 0.31633622
                        (OOB) G-mean: 0.66187789
   (OOB) Requested performance error: 0.33812211

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   198  61      0.2355
       Yes 716 961      0.4270

      (OOB) Misclassification rate: 0.401343
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1677.0000000  259.0000000    6.4749035    0.1337810    0.7644788    0.5730471 
        prec          npv     misclass        brier   brier.norm          auc 
   0.2166302    0.9403131    0.4013430    0.1050758    0.4203031    0.7368635 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.3375959    0.4580445    0.1337810    0.3163362    0.4997369    0.5599612 
       gmean 
   0.6618779 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
# ind_pos <- sample(c(0,1), nrow(pos), replace = T, prob = c(0.7, 0.3))
# ind_neg <- sample(c(0,1), nrow(neg), replace = T, prob = c(0.7, 0.3))
# 
# 
# train <- bind_rows(pos[ind_pos==0,],neg[ind_neg==0,])
# test <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])

imbal_index <- createDataPartition(rfdata$Health.Insurance,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Health.Insurance~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Health.Insurance ~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 1550
           Frequency of class labels: 760, 790
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 206.605
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 980
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0395
                   (OOB) Brier score: 0.07716915
        (OOB) Normalized Brier score: 0.30867661
                           (OOB) AUC: 0.98931046
                        (OOB) PR-AUC: 0.98844527
                        (OOB) G-mean: 0.93960883
   (OOB) Requested performance error: 0.06039117

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 692  68      0.0895
       0    24 766      0.0304

      (OOB) Misclassification rate: 0.05935484
print(rfobj)
                         Sample size: 1550
           Frequency of class labels: 760, 790
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 206.605
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 980
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0395
                   (OOB) Brier score: 0.07716915
        (OOB) Normalized Brier score: 0.30867661
                           (OOB) AUC: 0.98931046
                        (OOB) PR-AUC: 0.98844527
                        (OOB) G-mean: 0.93960883
   (OOB) Requested performance error: 0.06039117

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 692  68      0.0895
       0    24 766      0.0304

      (OOB) Misclassification rate: 0.05935484
plot(rfobj,plots.one.page = FALSE)


                             all   Yes    0
Religion                  0.0111    NA   NA
EnglishDiff               0.0100    NA   NA
EnglishSpeak              0.0088    NA   NA
Ethnicity                 0.0085    NA   NA
Community.Shares.Values   0.0076    NA   NA
Income_median             0.0068    NA   NA
Helpful.Friends           0.0051    NA   NA
Community.Trust           0.0049    NA   NA
Close.Friends             0.0044    NA   NA
Helpful.Family            0.0039    NA   NA
Successful.Family         0.0037    NA   NA
Religious.Importance      0.0037    NA   NA
Close.Family              0.0036    NA   NA
Get.Along                 0.0035    NA   NA
Close.knit.Community      0.0023    NA   NA
Togetherness              0.0018    NA   NA
Similar.Values            0.0018    NA   NA
Helpful.Community         0.0013    NA   NA
Family.Pride              0.0011    NA   NA
Family.Respect            0.0011    NA   NA
See.Friends               0.0010    NA   NA
See.Family                0.0010    NA   NA
Discrimination            0.0006    NA   NA
Gender                    0.0005    NA   NA
Employment                0.0004    NA   NA
Spend.Time.Together       0.0004    NA   NA
rfobj$importance
                                  all Yes  0
Ethnicity                0.0084731502  NA NA
Age                     -0.0013568366  NA NA
Gender                   0.0004875932  NA NA
Religion                 0.0110560906  NA NA
Employment               0.0004326155  NA NA
Income_median            0.0068200365  NA NA
EnglishSpeak             0.0088425976  NA NA
EnglishDiff              0.0099724240  NA NA
See.Family               0.0009860864  NA NA
Close.Family             0.0036275503  NA NA
Helpful.Family           0.0038767101  NA NA
See.Friends              0.0009860864  NA NA
Close.Friends            0.0044259859  NA NA
Helpful.Friends          0.0051039003  NA NA
Family.Respect           0.0011032086  NA NA
Similar.Values           0.0017796091  NA NA
Successful.Family        0.0037485625  NA NA
Trust                   -0.0008646339  NA NA
Loyalty                 -0.0015400189  NA NA
Family.Pride             0.0011032086  NA NA
Expression               0.0003687744  NA NA
Spend.Time.Together      0.0004272952  NA NA
Feel.Close              -0.0001887636  NA NA
Togetherness             0.0017796091  NA NA
Religious.Attendance    -0.0011390952  NA NA
Religious.Importance     0.0036871646  NA NA
Close.knit.Community     0.0022779010  NA NA
Helpful.Community        0.0013463726  NA NA
Community.Shares.Values  0.0076247813  NA NA
Get.Along                0.0034594049  NA NA
Community.Trust          0.0049194518  NA NA
Discrimination           0.0006135220  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
335.0000000  51.0000000   6.5686275   0.1321244   0.9607843   0.1283582 
       prec         npv    misclass       brier  brier.norm         auc 
  0.1436950   0.9555556   0.7616580   0.1669358   0.6677432   0.6633597 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.2500000   0.2375691   0.1321244   0.2230538   0.3005880   0.2943725 
      gmean 
  0.3511760 
test_rf$importance
                                  all Yes  0
Ethnicity                0.0185811541  NA NA
Age                      0.0046936485  NA NA
Gender                  -0.0049165067  NA NA
Religion                 0.0049522296  NA NA
Employment               0.0249069777  NA NA
Income_median            0.0384678821  NA NA
EnglishSpeak             0.0477676308  NA NA
EnglishDiff              0.0068514128  NA NA
See.Family              -0.0026200259  NA NA
Close.Family             0.0014578439  NA NA
Helpful.Family          -0.0090863643  NA NA
See.Friends             -0.0033656426  NA NA
Close.Friends           -0.0005776834  NA NA
Helpful.Friends         -0.0001471166  NA NA
Family.Respect           0.0026461124  NA NA
Similar.Values           0.0004857335  NA NA
Successful.Family        0.0010069939  NA NA
Trust                   -0.0003749214  NA NA
Loyalty                  0.0021279686  NA NA
Family.Pride             0.0042392685  NA NA
Expression               0.0035005613  NA NA
Spend.Time.Together      0.0004593727  NA NA
Feel.Close               0.0011686196  NA NA
Togetherness             0.0141334482  NA NA
Religious.Attendance     0.0138012802  NA NA
Religious.Importance     0.0135173515  NA NA
Close.knit.Community     0.0017685922  NA NA
Helpful.Community        0.0146127717  NA NA
Community.Shares.Values  0.0049706964  NA NA
Get.Along                0.0117843509  NA NA
Community.Trust          0.0121515197  NA NA
Discrimination          -0.0003562717  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

Dental Insurance

ps(`Dental Insurance`)
# A tibble: 3 × 3
  `Dental Insurance`     n   pct
  <fct>              <int> <dbl>
1 0                   1050 40.2 
2 Yes                 1529 58.6 
3 <NA>                  30  1.15

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Dental Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Dental.Insurance ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="gini")
print(imb)
                         Sample size: 1932
           Frequency of class labels: 760, 1172
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 353.968
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1221
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.5421
                   (OOB) Brier score: 0.17800194
        (OOB) Normalized Brier score: 0.71200778
                           (OOB) AUC: 0.79767323
                        (OOB) PR-AUC: 0.71118357
                        (OOB) G-mean: 0.73529229
   (OOB) Requested performance error: 0.26470771

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   588 172      0.2263
       Yes 353 819      0.3012

      (OOB) Misclassification rate: 0.2717391
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1172.0000000  760.0000000    1.5421053    0.3933747    0.7736842    0.6988055 
        prec          npv     misclass        brier   brier.norm          auc 
   0.6248672    0.8264379    0.2717391    0.1780019    0.7120078    0.7976732 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.6913580    0.7228198    0.3933747    0.7111836    0.7133252    0.7290560 
       gmean 
   0.7352923 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Dental.Insurance,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Dental.Insurance~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Dental.Insurance ~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 1546
           Frequency of class labels: 757, 789
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 227.409
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 977
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0423
                   (OOB) Brier score: 0.11458659
        (OOB) Normalized Brier score: 0.45834637
                           (OOB) AUC: 0.9374595
                        (OOB) PR-AUC: 0.93163041
                        (OOB) G-mean: 0.86901707
   (OOB) Requested performance error: 0.13098293

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 649 108      0.1427
       0    94 695      0.1191

      (OOB) Misclassification rate: 0.1306598
print(rfobj)
                         Sample size: 1546
           Frequency of class labels: 757, 789
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 227.409
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 977
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0423
                   (OOB) Brier score: 0.11458659
        (OOB) Normalized Brier score: 0.45834637
                           (OOB) AUC: 0.9374595
                        (OOB) PR-AUC: 0.93163041
                        (OOB) G-mean: 0.86901707
   (OOB) Requested performance error: 0.13098293

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 649 108      0.1427
       0    94 695      0.1191

      (OOB) Misclassification rate: 0.1306598
plot(rfobj,plots.one.page = FALSE)


                             all   Yes    0
Income_median             0.0322    NA   NA
Employment                0.0277    NA   NA
EnglishSpeak              0.0256    NA   NA
Religion                  0.0197    NA   NA
EnglishDiff               0.0185    NA   NA
Ethnicity                 0.0168    NA   NA
Religious.Attendance      0.0111    NA   NA
Age                       0.0092    NA   NA
Religious.Importance      0.0086    NA   NA
Community.Shares.Values   0.0086    NA   NA
Close.knit.Community      0.0085    NA   NA
Helpful.Family            0.0075    NA   NA
Get.Along                 0.0074    NA   NA
Expression                0.0071    NA   NA
Successful.Family         0.0066    NA   NA
Trust                     0.0059    NA   NA
Helpful.Community         0.0053    NA   NA
Loyalty                   0.0053    NA   NA
Similar.Values            0.0052    NA   NA
Spend.Time.Together       0.0052    NA   NA
Community.Trust           0.0046    NA   NA
Family.Respect            0.0046    NA   NA
Close.Friends             0.0033    NA   NA
Togetherness              0.0033    NA   NA
Family.Pride              0.0033    NA   NA
Helpful.Friends           0.0033    NA   NA
rfobj$importance
                                 all Yes  0
Ethnicity               0.0168312534  NA NA
Age                     0.0092060454  NA NA
Gender                  0.0001416190  NA NA
Religion                0.0197281929  NA NA
Employment              0.0276506891  NA NA
Income_median           0.0322000726  NA NA
EnglishSpeak            0.0256452206  NA NA
EnglishDiff             0.0184910635  NA NA
See.Family              0.0026821579  NA NA
Close.Family            0.0026821579  NA NA
Helpful.Family          0.0074523783  NA NA
See.Friends             0.0008143837  NA NA
Close.Friends           0.0033056447  NA NA
Helpful.Friends         0.0032592303  NA NA
Family.Respect          0.0045539676  NA NA
Similar.Values          0.0052243540  NA NA
Successful.Family       0.0065666900  NA NA
Trust                   0.0059437572  NA NA
Loyalty                 0.0053212736  NA NA
Family.Pride            0.0032592303  NA NA
Expression              0.0070987707  NA NA
Spend.Time.Together     0.0051788055  NA NA
Feel.Close              0.0020108405  NA NA
Togetherness            0.0032592303  NA NA
Religious.Attendance    0.0110745251  NA NA
Religious.Importance    0.0086366602  NA NA
Close.knit.Community    0.0085335299  NA NA
Helpful.Community       0.0053212736  NA NA
Community.Shares.Values 0.0085841206  NA NA
Get.Along               0.0073960245  NA NA
Community.Trust         0.0046004516  NA NA
Discrimination          0.0021093322  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
234.0000000 152.0000000   1.5394737   0.3937824   0.8618421   0.5512821 
       prec         npv    misclass       brier  brier.norm         auc 
  0.5550847   0.8600000   0.3264249   0.1880273   0.7521092   0.7908232 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.6752577   0.6735621   0.3937824   0.7055791   0.6822729   0.6814251 
      gmean 
  0.6892881 
test_rf$importance
                                  all Yes  0
Ethnicity                9.178649e-03  NA NA
Age                      8.612435e-03  NA NA
Gender                   1.066123e-03  NA NA
Religion                 4.104232e-04  NA NA
Employment               3.012834e-02  NA NA
Income_median            6.680711e-02  NA NA
EnglishSpeak             1.587445e-02  NA NA
EnglishDiff              5.340320e-03  NA NA
See.Family              -2.620456e-03  NA NA
Close.Family             4.712252e-04  NA NA
Helpful.Family           2.130361e-03  NA NA
See.Friends             -2.915042e-03  NA NA
Close.Friends           -2.087536e-03  NA NA
Helpful.Friends          5.657160e-04  NA NA
Family.Respect           6.766214e-04  NA NA
Similar.Values          -1.157392e-03  NA NA
Successful.Family        1.433907e-05  NA NA
Trust                    4.245357e-04  NA NA
Loyalty                  3.202822e-03  NA NA
Family.Pride             1.941612e-03  NA NA
Expression               3.678510e-03  NA NA
Spend.Time.Together     -2.102017e-04  NA NA
Feel.Close               1.327715e-04  NA NA
Togetherness             1.130552e-03  NA NA
Religious.Attendance    -6.206760e-04  NA NA
Religious.Importance     4.387678e-04  NA NA
Close.knit.Community    -1.151549e-03  NA NA
Helpful.Community        1.614995e-04  NA NA
Community.Shares.Values -8.626769e-04  NA NA
Get.Along                1.932312e-04  NA NA
Community.Trust          7.894886e-04  NA NA
Discrimination          -6.112475e-04  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

Physical Checkup

ps(`Physical Check-up`)
# A tibble: 3 × 3
  `Physical Check-up`     n   pct
  <fct>               <int> <dbl>
1 0                     833 31.9 
2 Yes                  1740 66.7 
3 <NA>                   36  1.38

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Physical.Check.up ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="gini")
print(imb)
                         Sample size: 2032
           Frequency of class labels: 652, 1380
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 457.723
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1284
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 2.1166
                   (OOB) Brier score: 0.19394941
        (OOB) Normalized Brier score: 0.77579766
                           (OOB) AUC: 0.72411421
                        (OOB) PR-AUC: 0.53617914
                        (OOB) G-mean: 0.66633872
   (OOB) Requested performance error: 0.33366128

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   425 227      0.3482
       Yes 440 940      0.3188

      (OOB) Misclassification rate: 0.328248
plot(imb,plots.one.page = F)


                       all    0   Yes
Health.Insurance    0.0312   NA    NA
Age                 0.0124   NA    NA
Gender              0.0032   NA    NA
Employment          0.0016   NA    NA
EnglishSpeak        0.0014   NA    NA
Income_median       0.0008   NA    NA
Dental.Insurance   -0.0013   NA    NA
EnglishDiff        -0.0053   NA    NA
Discrimination     -0.0073   NA    NA
Religion           -0.0124   NA    NA
Ethnicity          -0.0191   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1380.0000000  652.0000000    2.1165644    0.3208661    0.6518405    0.6811594 
        prec          npv     misclass        brier   brier.norm          auc 
   0.4913295    0.8054841    0.3282480    0.1939494    0.7757977    0.7241142 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.5603164    0.6370455    0.3208661    0.5361791    0.6133276    0.6516921 
       gmean 
   0.6663387 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Physical.Check.up,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Physical.Check.up~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Physical.Check.up~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 1626
           Frequency of class labels: 800, 826
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 307.8227
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1028
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0325
                   (OOB) Brier score: 0.1428173
        (OOB) Normalized Brier score: 0.57126919
                           (OOB) AUC: 0.88234262
                        (OOB) PR-AUC: 0.87509222
                        (OOB) G-mean: 0.80314153
   (OOB) Requested performance error: 0.19685847

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 640 160      0.2000
       0   160 666      0.1937

      (OOB) Misclassification rate: 0.196802
print(rfobj)
                         Sample size: 1626
           Frequency of class labels: 800, 826
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 307.8227
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1028
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0325
                   (OOB) Brier score: 0.1428173
        (OOB) Normalized Brier score: 0.57126919
                           (OOB) AUC: 0.88234262
                        (OOB) PR-AUC: 0.87509222
                        (OOB) G-mean: 0.80314153
   (OOB) Requested performance error: 0.19685847

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 640 160      0.2000
       0   160 666      0.1937

      (OOB) Misclassification rate: 0.196802
plot(rfobj,plots.one.page = FALSE)


                      all   Yes    0
Ethnicity          0.0597    NA   NA
Religion           0.0521    NA   NA
Health.Insurance   0.0520    NA   NA
EnglishDiff        0.0436    NA   NA
EnglishSpeak       0.0412    NA   NA
Age                0.0375    NA   NA
Gender             0.0258    NA   NA
Employment         0.0246    NA   NA
Income_median      0.0245    NA   NA
Discrimination     0.0156    NA   NA
Dental.Insurance   0.0153    NA   NA
rfobj$importance
                        all Yes  0
Ethnicity        0.05968643  NA NA
Age              0.03746991  NA NA
Gender           0.02577169  NA NA
Religion         0.05212527  NA NA
Employment       0.02460842  NA NA
Income_median    0.02445293  NA NA
EnglishSpeak     0.04115996  NA NA
EnglishDiff      0.04355390  NA NA
Health.Insurance 0.05199531  NA NA
Dental.Insurance 0.01530459  NA NA
Discrimination   0.01562159  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
276.0000000 130.0000000   2.1230769   0.3201970   0.8384615   0.4347826 
       prec         npv    misclass       brier  brier.norm         auc 
  0.4113208   0.8510638   0.4359606   0.2073340   0.8293359   0.7284281 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.5518987   0.5634713   0.3201970   0.5663324   0.5778386   0.5836249 
      gmean 
  0.6037785 
test_rf$importance
                          all Yes  0
Ethnicity         0.014364805  NA NA
Age               0.039086387  NA NA
Gender            0.019297506  NA NA
Religion         -0.010370314  NA NA
Employment        0.004825927  NA NA
Income_median     0.001628219  NA NA
EnglishSpeak      0.003351993  NA NA
EnglishDiff       0.014527465  NA NA
Health.Insurance  0.059401232  NA NA
Dental.Insurance  0.022944603  NA NA
Discrimination    0.005618560  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

Dental Checkup

ps(`Dentist Check-up`)
# A tibble: 3 × 3
  `Dentist Check-up`     n   pct
  <fct>              <int> <dbl>
1 0                   1100 42.2 
2 Yes                 1462 56.0 
3 <NA>                  47  1.80

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Dentist Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Dentist.Check.up ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="gini")
print(imb)
                         Sample size: 2026
           Frequency of class labels: 840, 1186
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 486.4383
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1280
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.4119
                   (OOB) Brier score: 0.21211144
        (OOB) Normalized Brier score: 0.84844575
                           (OOB) AUC: 0.73064021
                        (OOB) PR-AUC: 0.61408391
                        (OOB) G-mean: 0.67786705
   (OOB) Requested performance error: 0.32213295

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   578 262      0.3119
       Yes 394 792      0.3322

      (OOB) Misclassification rate: 0.3237907
plot(imb,plots.one.page = F)


                       all    0   Yes
Dental.Insurance    0.0218   NA    NA
Health.Insurance    0.0085   NA    NA
EnglishSpeak       -0.0012   NA    NA
Employment         -0.0015   NA    NA
Discrimination     -0.0043   NA    NA
Religion           -0.0096   NA    NA
EnglishDiff        -0.0102   NA    NA
Ethnicity          -0.0104   NA    NA
Income_median      -0.0121   NA    NA
Age                -0.0152   NA    NA
Gender             -0.0167   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1186.0000000  840.0000000    1.4119048    0.4146101    0.6880952    0.6677909 
        prec          npv     misclass        brier   brier.norm          auc 
   0.5946502    0.7514231    0.3237907    0.2121114    0.8484457    0.7306402 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.6379691    0.6707773    0.4146101    0.6140839    0.6579181    0.6743222 
       gmean 
   0.6778670 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Dentist.Check.up,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Dentist.Check.up~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Dentist.Check.up~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 1621
           Frequency of class labels: 798, 823
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 303.016
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1024
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0313
                   (OOB) Brier score: 0.14581753
        (OOB) Normalized Brier score: 0.58327014
                           (OOB) AUC: 0.87291436
                        (OOB) PR-AUC: 0.86568121
                        (OOB) G-mean: 0.78885027
   (OOB) Requested performance error: 0.21114973

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 623 175      0.2193
       0   167 656      0.2029

      (OOB) Misclassification rate: 0.2109809
print(rfobj)
                         Sample size: 1621
           Frequency of class labels: 798, 823
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 303.016
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1024
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0313
                   (OOB) Brier score: 0.14581753
        (OOB) Normalized Brier score: 0.58327014
                           (OOB) AUC: 0.87291436
                        (OOB) PR-AUC: 0.86568121
                        (OOB) G-mean: 0.78885027
   (OOB) Requested performance error: 0.21114973

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 623 175      0.2193
       0   167 656      0.2029

      (OOB) Misclassification rate: 0.2109809
plot(rfobj,plots.one.page = FALSE)


                       all   Yes    0
Dental.Insurance    0.0442    NA   NA
Religion            0.0337    NA   NA
Ethnicity           0.0331    NA   NA
Income_median       0.0297    NA   NA
EnglishSpeak        0.0290    NA   NA
EnglishDiff         0.0282    NA   NA
Age                 0.0210    NA   NA
Gender              0.0133    NA   NA
Health.Insurance    0.0127    NA   NA
Employment          0.0011    NA   NA
Discrimination     -0.0051    NA   NA
rfobj$importance
                          all Yes  0
Ethnicity         0.033144943  NA NA
Age               0.021018090  NA NA
Gender            0.013311024  NA NA
Religion          0.033707298  NA NA
Employment        0.001147374  NA NA
Income_median     0.029733426  NA NA
EnglishSpeak      0.029024705  NA NA
EnglishDiff       0.028190525  NA NA
Health.Insurance  0.012714401  NA NA
Dental.Insurance  0.044202800  NA NA
Discrimination   -0.005144584  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
237.0000000 168.0000000   1.4107143   0.4148148   0.7202381   0.6075949 
       prec         npv    misclass       brier  brier.norm         auc 
  0.5654206   0.7539267   0.3456790   0.2195652   0.8782608   0.7242817 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.6335079   0.6526087   0.4148148   0.6229583   0.6475156   0.6570660 
      gmean 
  0.6615233 
test_rf$importance
                           all Yes  0
Ethnicity         0.0006639681  NA NA
Age              -0.0117103332  NA NA
Gender           -0.0065929427  NA NA
Religion         -0.0074821532  NA NA
Employment       -0.0066514213  NA NA
Income_median    -0.0049266321  NA NA
EnglishSpeak      0.0000184808  NA NA
EnglishDiff      -0.0070078933  NA NA
Health.Insurance  0.0060148425  NA NA
Dental.Insurance  0.0418136484  NA NA
Discrimination   -0.0016691434  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

Urgent Care

ps(`Urgentcare`)
# A tibble: 3 × 3
  Urgentcare     n   pct
  <fct>      <int> <dbl>
1 0           2112 81.0 
2 Yes          440 16.9 
3 <NA>          57  2.18

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Urgentcare`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(`Urgentcare` ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="gini")
print(imb)
                         Sample size: 2017
           Frequency of class labels: 1673, 344
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 401.601
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1275
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 4.8634
                   (OOB) Brier score: 0.15698872
        (OOB) Normalized Brier score: 0.62795486
                           (OOB) AUC: 0.54438396
                        (OOB) PR-AUC: 0.19617836
                        (OOB) G-mean: 0.53928632
   (OOB) Requested performance error: 0.46071368

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   951 722      0.4316
       Yes 168 176      0.4884

      (OOB) Misclassification rate: 0.4412494
plot(imb,plots.one.page = F)


                       all    0   Yes
Discrimination      0.0190   NA    NA
Ethnicity           0.0077   NA    NA
Dental.Insurance    0.0074   NA    NA
EnglishDiff        -0.0018   NA    NA
Gender             -0.0024   NA    NA
Employment         -0.0034   NA    NA
Health.Insurance   -0.0042   NA    NA
EnglishSpeak       -0.0043   NA    NA
Age                -0.0044   NA    NA
Religion           -0.0058   NA    NA
Income_median      -0.0066   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1673.0000000  344.0000000    4.8633721    0.1705503    0.5116279    0.5684399 
        prec          npv     misclass        brier   brier.norm          auc 
   0.1959911    0.8498660    0.4412494    0.1569887    0.6279549    0.5443840 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.2834138    0.4002932    0.1705503    0.1961784    0.4113501    0.4697898 
       gmean 
   0.5392863 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Urgentcare,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Urgentcare~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Urgentcare~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 1615
           Frequency of class labels: 793, 822
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 339.0257
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1021
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0366
                   (OOB) Brier score: 0.15016519
        (OOB) Normalized Brier score: 0.60066076
                           (OOB) AUC: 0.87896681
                        (OOB) PR-AUC: 0.85969945
                        (OOB) G-mean: 0.8098881
   (OOB) Requested performance error: 0.1901119

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   626 167      0.2106
       Yes 139 683      0.1691

      (OOB) Misclassification rate: 0.1894737
print(rfobj)
                         Sample size: 1615
           Frequency of class labels: 793, 822
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 339.0257
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1021
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0366
                   (OOB) Brier score: 0.15016519
        (OOB) Normalized Brier score: 0.60066076
                           (OOB) AUC: 0.87896681
                        (OOB) PR-AUC: 0.85969945
                        (OOB) G-mean: 0.8098881
   (OOB) Requested performance error: 0.1901119

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   626 167      0.2106
       Yes 139 683      0.1691

      (OOB) Misclassification rate: 0.1894737
plot(rfobj,plots.one.page = FALSE)


                      all    0   Yes
Religion           0.0714   NA    NA
Ethnicity          0.0565   NA    NA
EnglishDiff        0.0558   NA    NA
EnglishSpeak       0.0378   NA    NA
Age                0.0365   NA    NA
Dental.Insurance   0.0292   NA    NA
Discrimination     0.0290   NA    NA
Gender             0.0259   NA    NA
Income_median      0.0210   NA    NA
Health.Insurance   0.0125   NA    NA
Employment         0.0107   NA    NA
rfobj$importance
                        all  0 Yes
Ethnicity        0.05649963 NA  NA
Age              0.03654373 NA  NA
Gender           0.02594001 NA  NA
Religion         0.07144963 NA  NA
Employment       0.01074328 NA  NA
Income_median    0.02095404 NA  NA
EnglishSpeak     0.03780840 NA  NA
EnglishDiff      0.05581575 NA  NA
Health.Insurance 0.01245267 NA  NA
Dental.Insurance 0.02916377 NA  NA
Discrimination   0.02901248 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
334.0000000  68.0000000   4.9117647   0.1691542   0.9852941   0.1197605 
       prec         npv    misclass       brier  brier.norm         auc 
  0.1855956   0.9756098   0.7338308   0.2004005   0.8016018   0.6097878 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.3123543   0.2535178   0.1691542   0.2236047   0.3279323   0.2985140 
      gmean 
  0.3435103 
test_rf$importance
                           all  0 Yes
Ethnicity         0.0556660407 NA  NA
Age               0.0011158844 NA  NA
Gender           -0.0003034483 NA  NA
Religion          0.0345407523 NA  NA
Employment       -0.0098904155 NA  NA
Income_median     0.0063793389 NA  NA
EnglishSpeak      0.0199196678 NA  NA
EnglishDiff       0.0498507656 NA  NA
Health.Insurance  0.0001078272 NA  NA
Dental.Insurance  0.0104565421 NA  NA
Discrimination    0.0075008564 NA  NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

Folk Medicine

ps(`Folkmedicine`)
# A tibble: 3 × 3
  Folkmedicine     n   pct
  <fct>        <int> <dbl>
1 0             2189 83.9 
2 Yes            348 13.3 
3 <NA>            72  2.76

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Folkmedicine ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="gini")
print(imb)
                         Sample size: 2012
           Frequency of class labels: 1735, 277
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 323.723
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1272
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 6.2635
                   (OOB) Brier score: 0.11478476
        (OOB) Normalized Brier score: 0.45913903
                           (OOB) AUC: 0.68994476
                        (OOB) PR-AUC: 0.26393605
                        (OOB) G-mean: 0.65278483
   (OOB) Requested performance error: 0.34721517

Confusion matrix:

          predicted
  observed    0 Yes class.error
       0   1107 628      0.3620
       Yes   92 185      0.3321

      (OOB) Misclassification rate: 0.3578529
plot(imb,plots.one.page = F)


                       all    0   Yes
Age                 0.0456   NA    NA
Ethnicity           0.0447   NA    NA
EnglishSpeak        0.0345   NA    NA
Employment          0.0166   NA    NA
Discrimination      0.0141   NA    NA
Income_median       0.0133   NA    NA
EnglishDiff         0.0100   NA    NA
Gender              0.0060   NA    NA
Religion            0.0014   NA    NA
Health.Insurance    0.0008   NA    NA
Dental.Insurance   -0.0015   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1735.0000000  277.0000000    6.2635379    0.1376740    0.6678700    0.6380403 
        prec          npv     misclass        brier   brier.norm          auc 
   0.2275523    0.9232694    0.3578529    0.1147848    0.4591390    0.6899448 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.3394495    0.4682581    0.1376740    0.2639361    0.4961172    0.5605214 
       gmean 
   0.6527848 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Folkmedicine,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Folkmedicine~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(`Folkmedicine` ~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="gini")
print(rfobj)
                         Sample size: 1610
           Frequency of class labels: 791, 819
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 303.209
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1018
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0354
                   (OOB) Brier score: 0.12083947
        (OOB) Normalized Brier score: 0.48335789
                           (OOB) AUC: 0.92561154
                        (OOB) PR-AUC: 0.9239028
                        (OOB) G-mean: 0.84214075
   (OOB) Requested performance error: 0.15785925

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   639 152      0.1922
       Yes 100 719      0.1221

      (OOB) Misclassification rate: 0.1565217
print(rfobj)
                         Sample size: 1610
           Frequency of class labels: 791, 819
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 303.209
No. of variables tried at each split: 4
              Total no. of variables: 11
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1018
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0354
                   (OOB) Brier score: 0.12083947
        (OOB) Normalized Brier score: 0.48335789
                           (OOB) AUC: 0.92561154
                        (OOB) PR-AUC: 0.9239028
                        (OOB) G-mean: 0.84214075
   (OOB) Requested performance error: 0.15785925

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   639 152      0.1922
       Yes 100 719      0.1221

      (OOB) Misclassification rate: 0.1565217
plot(rfobj,plots.one.page = FALSE)


                      all    0   Yes
Religion           0.0758   NA    NA
Ethnicity          0.0732   NA    NA
EnglishDiff        0.0558   NA    NA
EnglishSpeak       0.0460   NA    NA
Dental.Insurance   0.0311   NA    NA
Age                0.0266   NA    NA
Gender             0.0237   NA    NA
Employment         0.0167   NA    NA
Income_median      0.0125   NA    NA
Health.Insurance   0.0097   NA    NA
Discrimination     0.0091   NA    NA
rfobj$importance
                         all  0 Yes
Ethnicity        0.073217837 NA  NA
Age              0.026637303 NA  NA
Gender           0.023723749 NA  NA
Religion         0.075761540 NA  NA
Employment       0.016746624 NA  NA
Income_median    0.012502924 NA  NA
EnglishSpeak     0.046027137 NA  NA
EnglishDiff      0.055805156 NA  NA
Health.Insurance 0.009674993 NA  NA
Dental.Insurance 0.031051997 NA  NA
Discrimination   0.009053117 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance = T)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
347.0000000  55.0000000   6.3090909   0.1368159   0.9272727   0.2017291 
       prec         npv    misclass       brier  brier.norm         auc 
  0.1554878   0.9459459   0.6990050   0.1844695   0.7378781   0.5884202 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.2663185   0.2957685   0.1368159   0.1881303   0.3494102   0.3641352 
      gmean 
  0.4325019 
test_rf$importance
                           all  0 Yes
Ethnicity         0.0238138463 NA  NA
Age              -0.0017784139 NA  NA
Gender           -0.0002290971 NA  NA
Religion         -0.0149806420 NA  NA
Employment        0.0139455879 NA  NA
Income_median     0.0059245342 NA  NA
EnglishSpeak      0.0110700102 NA  NA
EnglishDiff       0.0092463758 NA  NA
Health.Insurance  0.0055529282 NA  NA
Dental.Insurance -0.0191544039 NA  NA
Discrimination   -0.0149694157 NA  NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot